from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import pandas as pd

# Open a Chrome session for the SSE "regulatory inquiries" (监管问询) page.
browser = webdriver.Chrome()

# Number of result pages to scrape.
max_page = 3

# One anchor per inquiry letter: group 1 captures the href, group 2 the link
# text. Compiled once up front so the HTML is scanned in a single pass per
# page, and every title is taken from the same anchor as its URL.
link_pattern = re.compile(
    '<a class="table_titlewrap" href="(.*?)" target="_blank">(.*?)</a>',
    re.S,
)

# Scrape page by page, collecting one DataFrame per page.
data_list = []
try:
    browser.maximize_window()
    browser.get('http://www.sse.com.cn/disclosure/credibility/supervision/inquiries/')

    for page in range(1, max_page + 1):

        # Fixed pause before snapshotting the HTML, so the page (or the
        # result of the previous "next page" click) has time to render.
        time.sleep(3)
        html = browser.page_source

        # Extract inquiry-letter titles and document URLs from the HTML.
        links = link_pattern.findall(html)
        title_list = [title for _, title in links]
        url_list = [url for url, _ in links]
        data = pd.DataFrame({'问询函标题': title_list, '文件网址': url_list})
        data_list.append(data)

        # Click the "next page" link, except after the last requested page.
        if page < max_page:
            # find_elements returns an empty list instead of raising, so a
            # missing "next" link ends the crawl while keeping what was
            # already collected.
            next_links = browser.find_elements(By.CSS_SELECTOR, 'li.next > a')
            if not next_links:
                break
            next_links[0].click()
finally:
    # Always shut the browser down, even if scraping failed part-way.
    browser.quit()

# Merge the per-page tables into a single frame with a fresh 0..n-1 index.
df = pd.concat(data_list, ignore_index=True)

# Remove the characters \ / : * ? " < > | from every title
# (presumably so titles are safe to use as file names -- confirm).
illegal_chars = r'[\\/:*?"<>|]'
cleaned_titles = df['问询函标题'].str.replace(illegal_chars, '', regex=True)
df['问询函标题'] = cleaned_titles

# Export the result to an Excel workbook, omitting the index column.
df.to_excel('上交所问询函.xlsx', index=False)
